# this function loads an xls or xlsx file, makes sure that it is formatted
# correctly, adds year and month columns, does some preliminary cleaning, 
# and then returns formatted dataframe
format_data <- function(path){
  
  print(path)
  #browser()
  
  #read in excel document
  data <- read_excel(path)
  
  #some data exchange sheets have additional banner, so have to remove those 
  a <- 1
  while(!("Market Name" %in% names(data))){
    data <- read_excel(path, skip = a)
    a <- a + 1
  }
  
  path_split <- str_split(path, "/")[[1]]
  
  districts <- c("Balaka", "Blantyre", "Kasungu", "Lilongwe", "M'mbelwa",
                 "Mbelwa", "Mmbelwa", "Mzimba", "Machinga", "Mulanje", "Zomba")
  
  var_names <- names(data)
  
  #check to see whether district column is missing
  if(!("District" %in% names(data))){
    file_name_split <- str_split(path_split[length(path_split)], " ")[[1]]
    District <- file_name_split[which(file_name_split %in% districts)]
    data <- data.frame(District = District, data)
    
    var_names <- c("District", var_names)
  }
  
  #cut out unnecessary rows
  data <- data %>% filter(!is.na(`District`))
  
  #extract month and year from path
  month_year <- path_split[length(path_split) - 1] %>% str_split(" ") %>% unlist
  
  #add month and year back in
  data <- data.frame(year = month_year[2], 
                     month = month_year[1],
                     data)
  
  #recreate character NAs
  for(n in names(data)){
    if(is.character(data[, n])){
      data[, n] <- ifelse(data[, n] == "NA" | 
                            data[, n] == "N/A" |
                            data[, n] == "Not available" |
                            data[, n] == "Not availabe" |
                            data[, n] == "n/a",
                          NA, data[, n])
    }
  }
  
  #convert all columns to character, which makes binding possible
  data <- data %>% mutate_all(as.character)
  
  #add column labels back in
  names(data)[3:ncol(data)] <- var_names
  
  return(data)
  
}